# Packages required by this script. ("ncdf4" was listed twice in the
# original vector; duplicates are harmless to library() but misleading.)
pkgs <- c(
  "tidyverse", "tidylog", "sp", "raster", "devtools", "RCurl", "sdmTMB",
  "viridis", "terra", "ncdf4", "chron", "tidync", "patchwork", "conflicted"
)
# Install any packages that are not yet present, then attach them all.
# Use TRUE rather than the reassignable shorthand T.
missing_pkgs <- setdiff(pkgs, rownames(installed.packages()))
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs, dependencies = TRUE)
}
invisible(lapply(pkgs, library, character.only = TRUE))
# Source code for map plots
devtools::source_url("https://raw.githubusercontent.com/VThunell/Lammska_cod-fr/main/R/functions/map-plot.R")
# Packages not on CRAN
# devtools::install_github("seananderson/ggsidekick") # not on CRAN
library(ggsidekick)
theme_set(theme_sleek())
# Set path
home <- here::here()
# Due to conflicting function names in tidyterra and dplyr, we give way for dplyr.
conflicts_prefer(dplyr::filter(),dplyr::mutate(), dplyr::between(), dplyr::summarise)

Combine environmental data from SMHI and Copernicus
Introduction
This script takes netcdf files with spatiotemporal environmental data and transforms them into a format suitable for our analysis.
We add environmental information to the stomach observations (see 03-add-env-data-to-stomachs) and use these as covariates in the models and in prediction (see 04-make-pred-grid).
The data comes from… * Bottom oxygen, temperature and salinity is from spatiotemporal simulated data available from the Copernicus and the Swedish Meteorological and Hydrological Institute (SMHI). * From Copernicus, we use the Baltic sea reanalysis (for years 1993-2021) and the analysis and forecast (for 2022-2023) of Nemo for salinity and temperature and the biogeochemical model ERGOM. * From SMHI we used hindcast simulations (1961-2017) of the NEMO–SCOBI model (NEMO-Nordic coupled to the Swedish Coastal and Ocean Biogeochemical model SCOBI, https://doi.org/10.5194/bg-2023-116).
Below, the two data sets are referred to as Copernicus and hindcast.
Load libraries
1. NEMO hindcast data 1961-2017 (bottom oxygen, temperature and salinity)
The SMHI hindcast data covering the Baltic and our variables of interest comes as one netcdf file per month (12 per year). Here we retrieve the information we need from each netcdf file and combine them into a data frame.
Netcdf files often have lat and long as dimensions covering space (and sometimes depth and time), which makes it easy to go from arrays to the 2-dimensional matrices or data frames that our models rely on. These files do not: lat and lon are instead 2d variables and the grid dimensions (x, y) are just integer-valued indices. So we have to fiddle with them a bit (join the coordinates back into the same file) to extract the space and depth for each time slice (month and year).
# # Get the file paths for each netcdf
# path <- paste0(home, "/data/hindcastRunToSLU/20240701_DataDelivery_SMHI_SLU")
# filepaths <- list.files(path, full.names=TRUE)
#
# # turn filepath name ending into corresponding year and month and check if any years do not contain 12 months
# as.data.frame(filepaths) |>
# mutate(year = as.factor(str_sub(filepaths, start = 99, end=102)),
# month = as.factor(str_sub(filepaths, start = 103, end=104))) |>
# summarise(n=n(), .by = year)
#
# f_envdat <- function(file) {
# a <- tidync(file) |>
# hyper_tibble()
#
# b <- tidync(file) |>
# activate( "D3,D2" ) |> # grid identifiers
# hyper_tibble()
#
# c <-left_join(a, b, by = c("x","y")) |>
# filter(between(nav_lat, 54, 60) & between(nav_lon, 12.5, 24.5), # reduce extent from coord in stomach data
# !vosaline == 0 ) |> # to filter out non-existing values, could also be oxy or votemper
# filter(depth == max(depth), .by= c(nav_lat,nav_lon)) |>
# mutate(year = str_sub(file, start = 99, end=102),
# month = str_sub(file, start = 103, end=104))
# }
#
# time <- Sys.time() # 35 min!!!
#
# hindenv_df <- filepaths |>
# map(\(x) f_envdat(x)) |>
# list_rbind()
#
# Sys.time() - time
#
# hindenv_df <- hindenv_df |>
# dplyr::select(-x,-y) |>
# rename(lat = nav_lat,
# lon = nav_lon,
# sal = vosaline,
# temp = votemper) |>
# mutate(month = as.integer(month),
# year = as.integer(year))
#
# saveRDS(hindenv_df, file = paste0(home, "/data/environment/hindcast_1961_2017.rds"))

2. Copernicus reanalysis and forecast 1993-2024 (bottom oxygen, temperature and salinity)
# # Oxygen
#
# # Source:
# # forecast, cmems_mod_bal_bgc_anfc_P1M-m_1723620449355.nc
# # https://data.marine.copernicus.eu/product/BALTICSEA_ANALYSISFORECAST_BGC_003_007/download?dataset=cmems_mod_bal_bgc_anfc_P1M-m_202311
#
# # reanalysis, cmems_mod_bal_bgc_my_P1M-m_1723620645492.nc
# # https://data.marine.copernicus.eu/product/BALTICSEA_MULTIYEAR_BGC_003_012/download?dataset=cmems_mod_bal_bgc_my_P1M-m_202303
#
# # forecast
# oxy_df1_fore <- tidync(paste0(home, "/data/NEMO/cmems_mod_bal_bgc_anfc_P1M-m_1723620449355.nc")) |>
# hyper_tibble() |>
# mutate(date = as_datetime(time, origin = '1970-01-01'),
# month = month(date),
# day = day(date),
# year = year(date),
# month_year = paste(month, year, sep = "_")) |>
# mutate(oxy = mean(o2b, na.rm = TRUE), .by = c(month_year, latitude, longitude)) |>
# mutate(oxy = oxy * 0.0223909) # conversion factor from mmol/m3 to ?
# # the hindcast data is averaged by month, so we do the same here even though it seems like its only one obs per month.
#
# # reanalysis
# oxy_df1_rean <- tidync(paste0(home, "/data/NEMO/cmems_mod_bal_bgc_my_P1M-m_1723620645492.nc")) |>
# hyper_tibble() |>
# mutate(date = as_datetime(time, origin = '1970-01-01'),
# month = month(date),
# day = day(date),
# year = year(date),
# month_year = paste(month, year, sep = "_")) |>
# mutate(oxy = mean(o2b, na.rm = TRUE), .by = c(month_year, latitude, longitude)) |>
# mutate(oxy = oxy * 0.0223909) # conversion factor from mmol/m3 to ?
# # the hindcast data is averaged by month, so we do the same here even though it seems like its only one obs per month.
#
# oxy_df <- bind_rows(oxy_df1_rean, oxy_df1_fore) |>
# dplyr::select(month_year, latitude, longitude, oxy, day, month, year)
#
# # Temperature and salinity
#
# # source
# # forecast cmems_mod_bal_phy_anfc_P1M-m_1723620759845.nc
# # https://data.marine.copernicus.eu/product/BALTICSEA_ANALYSISFORECAST_PHY_003_006/download?dataset=cmems_mod_bal_phy_anfc_P1M-m_202311
# # reanalysis cmems_mod_bal_phy_my_P1M-m_1723620956814.nc
# # https://data.marine.copernicus.eu/product/BALTICSEA_MULTIYEAR_PHY_003_011/download
#
# # forecast
# tempsal_df1_fore <- tidync(paste0(home, "/data/NEMO/cmems_mod_bal_phy_anfc_P1M-m_1723620759845.nc")) |>
# hyper_tibble() |>
# mutate(date = as_datetime(time, origin = '1970-01-01'),
# month = month(date),
# day = day(date),
# year = year(date),
# month_year = paste(month, year, sep = "_")) |>
# mutate(sal = mean(sob, na.rm = TRUE), .by = c(month_year, latitude, longitude)) |>
# mutate(temp = mean(bottomT, na.rm = TRUE), .by = c(month_year, latitude, longitude))
#
# #sum(ncvar_get(ncin,"bottomT") == -999, na.rm=TRUE) # no fill values to replace
#
# # reanalysis
# tempsal_df1_rean <- tidync(paste0(home, "/data/NEMO/cmems_mod_bal_phy_my_P1M-m_1723620956814.nc")) |>
# hyper_tibble() |>
# mutate(date = as_datetime(time, origin = '1970-01-01'),
# month = month(date),
# day = day(date),
# year = year(date),
# month_year = paste(month, year, sep = "_")) |>
# mutate(sal = mean(sob, na.rm = TRUE), .by = c(month_year, latitude, longitude)) |>
# mutate(temp = mean(bottomT, na.rm = TRUE), .by = c(month_year, latitude, longitude))
#
# tempsal_df <- bind_rows(tempsal_df1_rean, tempsal_df1_fore) |>
# dplyr::select(month_year, latitude, longitude, sal, temp, day, month, year)
#
# # join env variables
# copenv_df <-
# left_join(oxy_df, tempsal_df) |>
# rename(lat = latitude,
# lon = longitude)
#
# saveRDS(copenv_df, file = paste0(home, "/data/environment/copernicus_1993_2021.rds"))

3. Combine hindcast and Copernicus data
Combine SMHI and Copernicus in a df
# Load the SMHI hindcast (1961-2017) and tag each row with its source.
hindenv_df <- readRDS(file.path(home, "data", "environment", "hindcast_1961_2017.rds"))
hindenv_df_march <- hindenv_df |>
  mutate(model = factor("hindcast"))

# Load the Copernicus product (1993-2021), crop to the spatial extent of the
# stomach data, and tag each row with its source.
copenv_df <- readRDS(file.path(home, "data", "environment", "copernicus_1993_2021.rds"))
copenv_df_march <- copenv_df |>
  filter(between(lat, 54, 60), between(lon, 12.5, 24.5)) |>
  mutate(model = factor("copernicus"))
str(copenv_df_march)tibble [28,860,240 × 10] (S3: tbl_df/tbl/data.frame)
$ month_year: chr [1:28860240] "1_1993" "1_1993" "1_1993" "1_1993" ...
$ lat : num [1:28860240] 54 54 54 54 54 ...
$ lon : num [1:28860240] 13.8 13.8 13.8 13.9 13.9 ...
$ oxy : num [1:28860240] 9.01 8.95 8.93 8.81 8.79 ...
$ day : int [1:28860240] 1 1 1 1 1 1 1 1 1 1 ...
$ month : num [1:28860240] 1 1 1 1 1 1 1 1 1 1 ...
$ year : num [1:28860240] 1993 1993 1993 1993 1993 ...
$ sal : num [1:28860240] 5.22 4.98 5.04 5.15 5.34 ...
$ temp : num [1:28860240] 0.808 0.819 0.848 0.879 0.856 ...
$ model : Factor w/ 1 level "copernicus": 1 1 1 1 1 1 1 1 1 1 ...
str(hindenv_df_march)tibble [12,517,200 × 13] (S3: tbl_df/tbl/data.frame)
$ temp : num [1:12517200] 0.547 0.557 0.575 1.235 0.589 ...
$ sal : num [1:12517200] 8.16 8.21 8.24 7.88 8.31 ...
$ oxy : num [1:12517200] 9.47 9.48 9.48 9.21 9.47 ...
$ hypoxia1: num [1:12517200] 0 0 0 0 0 0 0 0 0 0 ...
$ hypoxia2: num [1:12517200] 0 0 0 0 0 0 0 0 0 0 ...
$ anoxia : num [1:12517200] 0 0 0 0 0 0 0 0 0 0 ...
$ depth : num [1:12517200] 7.54 7.54 7.54 7.54 7.54 ...
$ time : num [1:12517200] 1.93e+09 1.93e+09 1.93e+09 1.93e+09 1.93e+09 ...
$ lat : num [1:12517200] 54.2 54.2 54.2 54.2 54.2 ...
$ lon : num [1:12517200] 13.5 13.5 13.6 13.8 13.4 ...
$ year : int [1:12517200] 1961 1961 1961 1961 1961 1961 1961 1961 1961 1961 ...
$ month : int [1:12517200] 1 1 1 1 1 1 1 1 1 1 ...
$ model : Factor w/ 1 level "hindcast": 1 1 1 1 1 1 1 1 1 1 ...
# Stack the two sources (hindcast rows first, as before) and keep only the
# columns used downstream.
env_df_march <- bind_rows(hindenv_df_march, copenv_df_march) |>
  dplyr::select(lat, lon, oxy, temp, sal, month, year, model)
# NAs in sal and temp!
map(env_df_march, ~sum(is.na(.)))$lat
[1] 0
$lon
[1] 0
$oxy
[1] 0
$temp
[1] 318
$sal
[1] 208
$month
[1] 0
$year
[1] 0
$model
[1] 0
Plot environmental covariates for march and a few selected years
#oxy
# Map March bottom oxygen for four example years, faceted by data source
# (hindcast vs copernicus), to compare the two products visually.
env_df_march |>
filter(month == 3, #the most common month in the stomach data
lon > 13.5, # keep longitudes east of 13.5
year %in% c(1993,1999,2007,2014)) |>
ggplot(aes(lon, lat, fill = oxy)) +
geom_raster() +
facet_wrap(model~year, nrow = 2)Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
#temp
# Same map as above but for bottom temperature, viridis colour scale.
env_df_march |>
filter(month == 3, #the most common month in the stomach data
lon > 13.5,
year %in% c(1993,1999,2007,2014)) |>
ggplot(aes(lon, lat, fill = temp)) +
geom_raster() +
facet_wrap(model~year, nrow = 2) +
scale_fill_viridis()Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
#sal
# Same map as above but for bottom salinity, magma colour scale.
env_df_march |>
filter(month == 3, #the most common month in the stomach data
lon > 13.5,
year %in% c(1993,1999,2007,2014)) |>
ggplot(aes(lon, lat, fill = sal)) +
geom_raster() +
facet_wrap(model~year, nrow = 2) +
scale_fill_viridis(option="magma")Warning: Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
Raster pixels are placed at uneven horizontal intervals and will be shifted
ℹ Consider using `geom_tile()` instead.
4. Model spatial differences between Copernicus and SMHI-hindcast and predict the Hindcast for 2018-2024
Oxygen
To account for consistent differences between the Copernicus Baltic reanalysis and forecast and the SMHI Hindcast, we model the spatial and temporal variation between the models and predict the years that are missing from the Hindcast (2018-2024).
# There are differences between the data but in these plots, the main diff is that the hindcast contains a larger range of negative values. The "dip" in the distribution is still around 5.
# Side-by-side oxygen histograms for the two sources; the `+` between the two
# ggplot objects combines them via patchwork.
# NOTE(review): slice_sample() runs BEFORE the model filter, so the number of
# rows per source varies between draws; sampling within model would give
# equal group sizes.
env_df_march |>
slice_sample( n = 100000) |>
filter(model == "copernicus") |>
ggplot(aes(x=oxy)) +
geom_histogram() +
labs(title = "Copernicus") +
env_df_march |>
slice_sample( n = 100000) |>
filter(model == "hindcast") |>
ggplot(aes(x=oxy)) +
geom_histogram() +
labs(title = "hindcast")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# model data, filter and sample
# Build the modelling frame: keep the overlap era (> 1992), subsample for
# tractability, add a continuous monthly time index, and project lon/lat to
# UTM zone 33N (adds X and Y columns).
env_df_march2 <- env_df_march |>
filter(year > 1992) |>
slice_sample( n = 100000) |> # sample data as it is too much data to model
mutate(yearmonth = ((year-1963)*12)+month) |> # a continuous time predictor for the AR1 process
# group_by(model) |> # for equal sampling between groups
# sample_n(50000) |>
# ungroup() |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)
mesh <- make_mesh(env_df_march2, c("X", "Y"), cutoff = 8)This mesh has > 1000 vertices. Mesh complexity has the single largest influence
on fitting speed. Consider whether you require a mesh this complex, especially
for initial model exploration. Check `your_mesh$mesh$n` to view the number of
vertices.
plot(mesh)# spatial varying model intercept
# Candidate 1: spatially varying coefficient per data source, year as a
# categorical fixed effect; no shared spatial or spatiotemporal field.
Mod_oxy1 <- sdmTMB(
  formula = oxy ~ 0 + model + as.factor(year),
  data = env_df_march2,
  mesh = mesh,
  family = gaussian(link = "identity"),
  spatial = "off",
  spatiotemporal = "off",
  spatial_varying = ~ 0 + model
)
# spatial grf and both year and month as factor predictors
# Candidate 2: one shared spatial Gaussian random field (spatial = "on")
# instead of a source-specific spatial effect; year and month as factors.
Mod_oxy2 <-
sdmTMB(
data = env_df_march2,
formula = oxy ~ 0 + model + as.factor(year) + as.factor(month),
mesh = mesh,
spatial = "on",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# spatial_varying effect of model and both year and month as factor predictors
# Candidate 3: like candidate 1 but with month added as a categorical fixed
# effect alongside year.
Mod_oxy3 <-
sdmTMB(
data = env_df_march2,
formula = oxy ~ 0 + model + as.factor(year) + as.factor(month),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# time_varying ar1 intercept (year and month) and spatial_varying effect of model
# Candidate 4: AR(1) time-varying intercept over the continuous yearmonth
# index, plus a spatially varying coefficient per data source.
Mod_oxy4 <-
sdmTMB(
data = env_df_march2,
formula = oxy ~ 0 + model,
mesh = mesh,
time_varying_type = "ar1",
time_varying = ~1,
time = "yearmonth",
extra_time = c(719), # 719 = (2022-1963)*12 + 11, i.e. Nov 2022 -- a yearmonth absent from the data, filled so the AR(1) series is unbroken
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
sanity(Mod_oxy1)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_oxy2)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_oxy3)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_oxy4)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
AIC(Mod_oxy1)[1] 299497.1
AIC(Mod_oxy2)[1] 288815.7
AIC(Mod_oxy3)[1] 277494.3
AIC(Mod_oxy4)[1] 277807.2
Models 3 and 4 have the lowest and similar AIC. Which one of these predicts oxygen best for years that are withheld from the model (2007-2012 used below), and how do the hindcast-predicted years 2018-2024 compare to the Copernicus forecast? From here on we only use data from 2000 onwards to speed things up a bit.
# remove hindcast data for 2007-2012
mesh <- make_mesh(env_df_march2 |> filter(year > 2000) |> filter(!(year %in% 2007:2012 & model == "hindcast")), c("X", "Y"), cutoff = 8)This mesh has > 1000 vertices. Mesh complexity has the single largest influence
on fitting speed. Consider whether you require a mesh this complex, especially
for initial model exploration. Check `your_mesh$mesh$n` to view the number of
vertices.
# Refit candidate 3 on post-2000 data with the 2007-2012 hindcast rows
# withheld, so those years can be predicted out-of-sample.
Mod_oxy3b <-
sdmTMB(
data = env_df_march2 |> filter(year > 2000) |> filter(!(year %in% 2007:2012 & model == "hindcast")),
formula = oxy ~ 0 + model + as.factor(year) + as.factor(month),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# Same withheld subset, but with the AR(1) time-varying intercept (candidate 4).
Mod_oxy4b <-
sdmTMB(
data = env_df_march2 |> filter(year > 2000) |> filter(!(year %in% 2007:2012 & model == "hindcast")),
formula = oxy ~ 0 + model, # + as.factor(year) + as.factor(month)
mesh = mesh,
time_varying_type = "ar1",
time_varying = ~1,
time = "yearmonth",
extra_time = c(719), # Nov 2022, missing from the yearmonth series
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# Extend data with missing year for hindcast and predict on data.
# Model 3 newdata: every distinct hindcast grid cell replicated over
# 2001-2023, March only (month = 3), with UTM coordinates added.
env_df_march_predhind3 <- env_df_march |>
filter( model == "hindcast") |>
distinct(lat, lon) |>
replicate_df(time_name = "year", time_values = 2001:2023) |>
mutate( model = as_factor("hindcast"),
month = 3) |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)
oxy_predhind3 <- predict(Mod_oxy3b, newdata = env_df_march_predhind3)
# Model 4 newdata
# Same grid, but time is the continuous yearmonth index: the sequence below
# is March of each year 2001-2023 (start March 2001, step 12 months).
env_df_march_predhind4 <- env_df_march |>
filter( model == "hindcast") |>
distinct(lat, lon) |>
replicate_df(time_name = "yearmonth", time_values = seq((2001-1963)*12+3,(2023-1963)*12+3, by=12)) |>
mutate( model = as_factor("hindcast")) |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)
oxy_predhind4 <- predict(Mod_oxy4b, newdata = env_df_march_predhind4)
# Plot compare model 3 and 4
# Annual mean predictions from both candidates (model 4's yearmonth index is
# back-transformed to calendar year), overlaid on the raw March means of the
# two data sources.
bind_rows(oxy_predhind3 |> mutate(Mod ="Mod_oxy3"), oxy_predhind4 |> mutate(Mod = "Mod_oxy4", year = 1963+(yearmonth-3)/12)) |>
mutate(mean_est = mean(est), .by = c(year, Mod)) |>
filter(year > 1990) |>
ggplot() +
geom_line(aes(year, mean_est, color = as.factor(Mod))) +
geom_line(data = env_df_march |> filter(month == 3, year > 1990) |> summarise(oxy = mean(oxy), .by = c(year,model)), aes(x = year, y = oxy, linetype = model)) +
geom_vline( xintercept = c(2007,2012))

While models 3 and 4 differ in their predictions for 2007-2012, they are very similar. For years after 2018 they are even more similar.
Now we compare spatial differences model 3 and 4 for oxygen.
# In-sample predictions from the full-data fits, used below to compare the
# spatial pattern of residuals between model 3 and model 4.
oxy_predb3 <- predict(Mod_oxy3)
oxy_predb4 <- predict(Mod_oxy4)
# Mean absolute in-sample residual (observed oxy minus predicted est) per
# decade, compared between the two candidate models.
bind_rows(oxy_predb3 |> mutate(oxymod="Mod_oxy3"), oxy_predb4 |> mutate(oxymod="Mod_oxy4")) |>
mutate( diff = oxy - est, .by = c(year, model)) |> # row-wise residual; the .by grouping has no effect on this expression
mutate(decade = round(year/10) * 10) |> # NOTE(review): round() assigns years x5-x9 to the NEXT decade; floor() would give calendar decades -- confirm intent
summarise(decadaldiffs = mean(abs(diff)), .by = c(decade, oxymod)) |>
arrange(decade) |>
filter(decade > 1990)# A tibble: 6 × 3
decade oxymod decadaldiffs
<dbl> <chr> <dbl>
1 2000 Mod_oxy3 0.683
2 2000 Mod_oxy4 0.679
3 2010 Mod_oxy3 0.688
4 2010 Mod_oxy4 0.685
5 2020 Mod_oxy3 0.697
6 2020 Mod_oxy4 0.694
# Map the in-sample residuals for the hindcast rows per decade, model 3 vs
# model 4 side by side.
bind_rows(oxy_predb3 |> mutate(oxymod="Mod_oxy3"), oxy_predb4 |> mutate(oxymod="Mod_oxy4")) |>
mutate( diff = oxy - est, .by = c(year, model)) |>
mutate(decade = round(year/10) * 10) |> # NOTE(review): round() bins years x5-x9 into the next decade
filter( model == "hindcast",
decade > 1990) |>
ggplot(aes(lon,lat, color = diff)) +
geom_point(size=0.1) +
facet_wrap(decade~oxymod, ncol = 2) +
scale_colour_viridis(option= "turbo")

Despite being very similar, the model with an AR1 process produces a lower mean spatial difference. Visually they are indistinguishable, I think. AR1 is the winner as it has lower diffs and lower AIC.
# Selected model of oxygen
# Re-predict on the March 2001-2023 grid using the FULL-data fit (Mod_oxy4),
# overwriting the cross-validation predictions from Mod_oxy4b above.
oxy_predhind4 <- predict(Mod_oxy4, newdata = env_df_march_predhind4)
# Plot oxygen model for all years (1963-2023)
oxy_predhind4 |> mutate(Mod = "Mod_oxy4", year = 1963+(yearmonth-3)/12) |>
mutate(mean_est = mean(est), .by = c(year)) |>
ggplot() +
geom_line(aes(year, mean_est, linetype = Mod)) +
geom_line(data = env_df_march |> filter(month == 3, year > 1990) |> summarise(oxy = mean(oxy), .by = c(year,model)), aes(x = year, y = oxy, color = model)) +
labs(title = "Oxygen model")

Salinity
We skip corresponding oxygen model 2 for salinity here
# Side-by-side salinity histograms for the two sources (combined via
# patchwork `+`); here the full data is used, no subsampling.
env_df_march |>
filter(model == "copernicus") |>
ggplot(aes(x=sal)) +
geom_histogram() +
labs(title = "Copernicus") +
env_df_march |>
filter(model == "hindcast") |>
ggplot(aes(x=sal)) +
geom_histogram() +
labs(title = "Hindcast")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 208 rows containing non-finite outside the scale range
(`stat_bin()`).
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Rebuild the modelling frame for salinity: drop rows with missing sal,
# subsample, add the continuous month index, and project to UTM 33N.
# (Unlike the oxygen frame, no year > 1992 filter is applied here.)
env_df_march2 <- env_df_march |>
drop_na(sal) |>
slice_sample( n = 100000) |> # sample data as it is too much data to model
mutate(yearmonth = ((year-1963)*12)+month) |> # a continuous time predictor for the AR1 process
# group_by(model) |>
# sample_n(50000) |>
# ungroup() |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)
summary(env_df_march2$sal) Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00054 7.11164 8.30024 8.86648 10.32366 35.08593
mesh <- make_mesh(env_df_march2, c("X", "Y"), cutoff = 8)This mesh has > 1000 vertices. Mesh complexity has the single largest influence
on fitting speed. Consider whether you require a mesh this complex, especially
for initial model exploration. Check `your_mesh$mesh$n` to view the number of
vertices.
# spatial varying model intercept
# Salinity candidate 1: spatially varying coefficient per data source with
# year as a categorical fixed effect (mirrors Mod_oxy1).
Mod_sal1 <-
sdmTMB(
data = env_df_march2,
formula = sal ~ 0 + model + as.factor(year),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# spatial varying model intercept and year and month as factor predictors
# Salinity candidate 3: adds month as a categorical fixed effect
# (mirrors Mod_oxy3; the oxygen candidate 2 is skipped for salinity).
Mod_sal3 <-
sdmTMB(
data = env_df_march2,
formula = sal ~ 0 + model + as.factor(year) + as.factor(month),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# time_varying ar1 intercept (year and month) and spatial_varying effect of model
# Salinity candidate 4: AR(1) time-varying intercept over yearmonth
# (mirrors Mod_oxy4).
Mod_sal4 <-
sdmTMB(
data = env_df_march2,
formula = sal ~ 0 + model,
mesh = mesh,
time_varying_type = "ar1",
time_varying = ~1,
time = "yearmonth",
extra_time = c(719), # add the month that is missing (Nov 2022)
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
sanity(Mod_sal1)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_sal3)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_sal4)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
AIC(Mod_sal1)[1] 232665.2
AIC(Mod_sal3)[1] 232497
AIC(Mod_sal4)[1] 231746.6
The AR1 model is best for salinity as well based on AIC.
# extend data with missing year for hindcast and predict on data. The resulting df we will use to extract values to the pred grid instead of predicting on the pred grid.
# Reuses the oxygen prediction grid (hindcast cells, March 2001-2023).
sal_predhind <- predict(Mod_sal4, newdata = env_df_march_predhind4)
# Plot salinity model
# Annual mean salinity predictions vs the raw March means of the two sources.
# floor(1963 + yearmonth/12) equals 1963 + (yearmonth-3)/12 here because all
# yearmonth values in the prediction grid are March (month offset 3 < 12).
sal_predhind |> mutate(Mod = "Mod_sal4", year = floor(1963+(yearmonth/12))) |>
mutate(mean_est = mean(est), .by = c(year)) |>
ggplot() +
geom_line(aes(year, mean_est, linetype = Mod)) +
geom_line(data = env_df_march |> filter(month == 3, year > 1990) |> summarise(sal = mean(sal), .by = c(year,model)), aes(x = year, y = sal, color = model)) +
labs(title = "Salinity model")

Temperature
# Side-by-side temperature histograms for the two sources (combined via
# patchwork `+`).
env_df_march |>
filter(model == "copernicus") |>
ggplot(aes(x=temp)) +
geom_histogram() +
labs(title = "Copernicus") +
env_df_march |>
filter(model == "hindcast") |>
ggplot(aes(x=temp)) +
geom_histogram() +
labs(title = "Hindcast")`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 318 rows containing non-finite outside the scale range
(`stat_bin()`).
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Rebuild the modelling frame for temperature: drop rows with missing temp,
# subsample, add the continuous month index, and project to UTM 33N.
env_df_march2 <- env_df_march |>
drop_na(temp) |>
slice_sample( n = 100000) |> # sample data as it is too much data to model
mutate(yearmonth = ((year-1963)*12)+month) |> # a continuous time predictor for the AR process
# group_by(model) |>
# sample_n(50000) |>
# ungroup() |>
add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633)
summary(env_df_march2$temp) Min. 1st Qu. Median Mean 3rd Qu. Max.
-0.4715 4.0471 5.1419 5.9490 6.8464 23.8515
mesh <- make_mesh(env_df_march2, c("X", "Y"), cutoff = 8)This mesh has > 1000 vertices. Mesh complexity has the single largest influence
on fitting speed. Consider whether you require a mesh this complex, especially
for initial model exploration. Check `your_mesh$mesh$n` to view the number of
vertices.
# spatial varying model intercept
# Temperature candidate 1: spatially varying coefficient per data source with
# year as a categorical fixed effect (mirrors Mod_oxy1).
Mod_temp1 <-
sdmTMB(
data = env_df_march2,
formula = temp ~ 0 + model + as.factor(year),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity")
)
# spatial_varying effect of model
# Temperature candidate 3: adds month as a categorical fixed effect
# (mirrors Mod_oxy3).
Mod_temp3 <-
sdmTMB(
data = env_df_march2,
formula = temp ~ 0 + model + as.factor(year) + as.factor(month),
mesh = mesh,
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
# time_varying ar1 intercept (year and month) and spatial_varying effect of model
# Temperature candidate 4: AR(1) time-varying intercept over yearmonth
# (mirrors Mod_oxy4).
Mod_temp4 <-
sdmTMB(
data = env_df_march2,
formula = temp ~ 0 + model,
mesh = mesh,
time_varying_type = "ar1",
time_varying = ~1,
time = "yearmonth",
extra_time = c(719), # Nov 2022, missing from the yearmonth series
spatial_varying = ~ 0 + model,
spatial = "off",
spatiotemporal = "off",
family = gaussian(link = "identity"),
)
sanity(Mod_temp1)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_temp3)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
sanity(Mod_temp4)✔ Non-linear minimizer suggests successful convergence
✔ Hessian matrix is positive definite
✔ No extreme or very small eigenvalues detected
✔ No gradients with respect to fixed effects are >= 0.001
✔ No fixed-effect standard errors are NA
✔ No standard errors look unreasonably large
✔ No sigma parameters are < 0.01
✔ No sigma parameters are > 100
✔ Range parameter doesn't look unreasonably large
AIC(Mod_temp1)[1] 517427.3
AIC(Mod_temp3)[1] 481401.7
AIC(Mod_temp4)[1] 482498.5
The model with month and year as factor predictors is best for temperature based on AIC.
# Use the extended data pred_hind4 to predict missing years.
# Mod_temp3 uses year and month as fixed effects, so derive them from the
# yearmonth index (all grid rows are March, hence month = 3; the floor()
# back-transform recovers the calendar year).
temp_predhind <- predict(Mod_temp3, newdata = env_df_march_predhind4 |> mutate(year = floor(1963+(yearmonth/12)), month = 3))
# Plot temperature model
# Annual mean temperature predictions vs the raw March means of the two
# sources. The prediction line comes from Mod_temp3 (see the predict() call
# above); the original labelled it "Mod_temp4", which was incorrect.
temp_predhind |>
mutate(Mod = "Mod_temp3",
mean_est = mean(est), .by = c(year)) |>
filter(year > 1990) |>
ggplot() +
geom_line(aes(year, mean_est, linetype = Mod)) +
geom_line(data = env_df_march |> filter(month == 3, year > 1990) |> summarise(temp = mean(temp), .by = c(year,model)), aes(x = year, y = temp, color = model)) +
labs(title = "Temperature model")

5. Plot and save predictions for 2018-2024 with hindcast and plot
# combine oxy, salinity and temperature predictions
# Join the three prediction sets on lat/lon/year (dplyr's default natural
# join, see the "Joining with" messages), then append the raw hindcast rows.
# NOTE(review): the prediction rows cover 2001-2023 (filtered to > 1991) and
# the raw hindcast rows cover years < 2018, so 2001-2017 appears in BOTH
# sources -- confirm this duplication is intended downstream.
env_df_comb <-
oxy_predhind4 |>
mutate(oxy = est,
year = 1963+(yearmonth-3)/12) |> # back-transform March yearmonth index to calendar year
dplyr::select("lat", "lon", "year", "oxy") |>
left_join(sal_predhind |> mutate(sal = est, year = 1963+(yearmonth-3)/12) |> dplyr::select("lat", "lon", "sal", "year")) |>
left_join(temp_predhind |> mutate(temp = est) |> dplyr::select("lat", "lon", "temp", "year")) |> filter(year > 1991) |>
bind_rows(env_df_march |> filter(year < 2018 & model == "hindcast")) |>
dplyr::select("year","lat", "lon", "oxy", "temp", "sal") |>
mutate(month = 3) |>
filter(lon > 13.5)Joining with `by = join_by(lat, lon, year)`
Joining with `by = join_by(lat, lon, year)`
# plot
# Three map panels (combined via patchwork `+`) on the plot_map_fc base map:
# oxygen in 2019 (predicted years), salinity in 1963 and temperature in 1999
# (raw hindcast years). Coordinates are converted to UTM metres (X/Y * 1000)
# to match the base map's projection.
plot_map_fc +
geom_point(data = env_df_comb |> filter(year == 2019) |> add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633), aes(X*1000, Y*1000, color = oxy), alpha = 0.5) +
theme_sleek(base_size = 6) +
geom_sf() +
labs(title = "2019") +
plot_map_fc +
geom_point(data = env_df_comb |> filter(year == 1963) |> add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633), aes(X*1000, Y*1000, color = sal), alpha = 0.5) +
theme_sleek(base_size = 6) +
geom_sf() +
plot_map_fc +
geom_point(data = env_df_comb |> filter(year == 1999) |> add_utm_columns(ll_names = c("lon", "lat"), utm_crs = 32633), aes(X*1000, Y*1000, color = temp), alpha = 0.5) +
theme_sleek(base_size = 6) +
geom_sf() Warning: Removed 6525 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 78300 rows containing missing values or values outside the scale range
(`geom_point()`).
Removed 78300 rows containing missing values or values outside the scale range
(`geom_point()`).
Save models and use in extracting data for stomach data and pred_grid (scrip 03 and 04)
# Persist the selected models; scripts 03 and 04 read these back in.
model_files <- list(
  "Mod_oxy.rds"  = Mod_oxy4,
  "Mod_sal.rds"  = Mod_sal4,
  "Mod_temp.rds" = Mod_temp3
)
for (fname in names(model_files)) {
  saveRDS(model_files[[fname]], file.path(home, "R", "prepare-data", fname))
}